In [1]:
import librosa
from librosa import display
from PIL import Image
from matplotlib import pyplot
from numpy import asarray
import matplotlib.pyplot as plt
import numpy as np
import cmath
import seaborn as sns
import scipy
import IPython.display as ipd
import math
from numpy.linalg import inv
import torch 
import torchvision
from torchvision import datasets
import numpy as np
import time
import numpy
import matplotlib.pyplot as plt
import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Performing STFT and uploading the audio files

In [2]:
train_clean_male, sr1 = librosa.load("train_clean_male.wav",sr=None)
S=librosa.stft(train_clean_male,n_fft=1024,hop_length=512)
ipd.display(ipd.Audio(train_clean_male,rate=16000))
In [3]:
sn,sr2=librosa.load("train_dirty_male.wav",sr=None)
X=librosa.stft(sn,n_fft=1024,hop_length=512)
ipd.display(ipd.Audio(sn,rate=16000))

Taking Absolute Values of X

In [4]:
mod_S=np.abs(S)
mod_X=np.abs(X)

print(np.shape(mod_S))
(513, 2459)

Defining Network with three hidden layers and batch Size

In [5]:
input_size1 = 513
hidden_size1 = [1024,1024,1024]
num_classes1 = 513
num_epochs = 500
batch_size1 = 128

Class for Neural Network

In [6]:
class ThreeLayer_NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes,initialization,activation,dropout):
        super(ThreeLayer_NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size[0]) 
        if initialization =='xavier':
          torch.nn.init.xavier_normal_(self.fc1.weight)
          torch.nn.init.zeros_(self.fc1.bias)
        if initialization == 'normal':
          torch.nn.init.normal_(self.fc1.weight,0, 0.01)
          torch.nn.init.zeros_(self.fc1.bias)
        if initialization == 'Kaiming':
          torch.nn.init.kaiming_normal_(self.fc1.weight)
          torch.nn.init.zeros_(self.fc1.bias)

        self.fc2 = nn.Linear(hidden_size[0], hidden_size[1]) 
        if initialization =='xavier':
          torch.nn.init.xavier_normal_(self.fc2.weight)
          torch.nn.init.zeros_(self.fc2.bias)
        if initialization == 'normal':
          torch.nn.init.normal_(self.fc2.weight,0, 0.01)
          torch.nn.init.zeros_(self.fc2.bias)
        if initialization == 'Kaiming':
          torch.nn.init.kaiming_normal_(self.fc2.weight)
          torch.nn.init.zeros_(self.fc2.bias)

        self.fc3 = nn.Linear(hidden_size[1], hidden_size[2]) 
        if initialization =='xavier':
          torch.nn.init.xavier_normal_(self.fc3.weight)
          torch.nn.init.zeros_(self.fc3.bias)
        if initialization == 'normal':
          torch.nn.init.normal_(self.fc3.weight,0, 0.01)
          torch.nn.init.zeros_(self.fc3.bias)
        if initialization == 'Kaiming':
          torch.nn.init.kaiming_normal_(self.fc3.weight)
          torch.nn.init.zeros_(self.fc3.bias) 

        self.fc4 = nn.Linear(hidden_size[2], num_classes)
        if initialization =='xavier':
          torch.nn.init.xavier_normal_(self.fc4.weight)
          torch.nn.init.zeros_(self.fc4.bias)
        if initialization == 'normal':
          torch.nn.init.normal_(self.fc4.weight,0, 0.01)
          torch.nn.init.zeros_(self.fc4.bias)
        if initialization == 'Kaiming':
          torch.nn.init.kaiming_normal_(self.fc4.weight)
          torch.nn.init.zeros_(self.fc4.bias)  

        if activation=="relu":
          self.act= nn.ReLU()
        if activation=="sigmoid":
          self.act=nn.Sigmoid()
        if activation=="tanh":
          self.act=nn.Tanh()

        if dropout== "Yes":
          self.firstlayer_dropout=torch.nn.Dropout(p=0.4)
          self.hiddenlayer_dropout=torch.nn.Dropout(p=0.4)
        
        if dropout== "No":
          self.firstlayer_dropout=torch.nn.Dropout(p=0)
          self.hiddenlayer_dropout=torch.nn.Dropout(p=0)
        
        self.act1=nn.ReLU()

    
    def forward(self, x):
        x=torch.transpose(x, 0, 1).to(device)
        out1 = self.fc1(x)
        out2 = self.hiddenlayer_dropout(self.act(out1))
        out3 = self.fc2(out2)
        out4 = self.hiddenlayer_dropout(self.act(out3))
        out5 = self.fc3(out4)
        out6 = self.hiddenlayer_dropout(self.act(out5))
        out7 = self.fc4(out6)
        out8 = self.act1(out7)
        return out8
In [7]:
model1 = ThreeLayer_NeuralNet(input_size1, hidden_size1,num_classes1,"Kaiming","relu","No").cuda()

# Loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model1.parameters(),lr=0.0003) 
In [8]:
errt=[0 for i in range(200)]
for epoch in range(200):
    running_loss=0
    for j in range(19):
        # Move tensors to the configured device
        if (j+1)*128 <= 2459:
          images= torch.tensor(mod_X[:,j*128:(j+1)*128],device=device)
          labels=torch.tensor(mod_S[:,j*128:(j+1)*128],device=device)
        else:
          images=torch.tensor(mod_X[:,j*128:2459],device=device)
          labels = torch.tensor(mod_S[:,j*128:2459],device=device)
        
        # Forward pass
        outputs = model1(images).cuda()
        loss = criterion(outputs.cuda(), torch.transpose(labels,0,1))
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
    
    errt[epoch]=running_loss/19
    print("Epoch:", epoch,"Loss:",running_loss/19)
Epoch: 0 Loss: 0.07171502728995524
Epoch: 1 Loss: 0.0334648538969065
Epoch: 2 Loss: 0.02266844165952582
Epoch: 3 Loss: 0.018043886576043933
Epoch: 4 Loss: 0.015471538798393388
Epoch: 5 Loss: 0.014019278956479147
Epoch: 6 Loss: 0.013102388058445956
Epoch: 7 Loss: 0.01257520110199326
Epoch: 8 Loss: 0.011817853775267539
Epoch: 9 Loss: 0.01133059688206566
Epoch: 10 Loss: 0.01081573066154593
Epoch: 11 Loss: 0.010460123125659792
Epoch: 12 Loss: 0.010057754152895589
Epoch: 13 Loss: 0.0097488936428961
Epoch: 14 Loss: 0.009357415507302472
Epoch: 15 Loss: 0.00902899781143979
Epoch: 16 Loss: 0.008718149246353852
Epoch: 17 Loss: 0.008523733882037433
Epoch: 18 Loss: 0.008633141307846495
Epoch: 19 Loss: 0.008516664114339571
Epoch: 20 Loss: 0.008272554872459486
Epoch: 21 Loss: 0.008095532741495654
Epoch: 22 Loss: 0.007762569443959939
Epoch: 23 Loss: 0.007436632583113878
Epoch: 24 Loss: 0.00716590908306994
Epoch: 25 Loss: 0.006884807120322397
Epoch: 26 Loss: 0.006768954059991397
Epoch: 27 Loss: 0.0066116912241436935
Epoch: 28 Loss: 0.006413369672372937
Epoch: 29 Loss: 0.006288785806023761
Epoch: 30 Loss: 0.006388236793052209
Epoch: 31 Loss: 0.006418746001528282
Epoch: 32 Loss: 0.0065476607751885525
Epoch: 33 Loss: 0.006884085320818581
Epoch: 34 Loss: 0.0065526679329770176
Epoch: 35 Loss: 0.006273906812758038
Epoch: 36 Loss: 0.006084442677858628
Epoch: 37 Loss: 0.005897226584094919
Epoch: 38 Loss: 0.005784239718004277
Epoch: 39 Loss: 0.005555789966724421
Epoch: 40 Loss: 0.005485920811464128
Epoch: 41 Loss: 0.005636435185902212
Epoch: 42 Loss: 0.005460524968312759
Epoch: 43 Loss: 0.005637679040726078
Epoch: 44 Loss: 0.005596441976529987
Epoch: 45 Loss: 0.0054722119514879425
Epoch: 46 Loss: 0.005896151470216482
Epoch: 47 Loss: 0.005592556694816602
Epoch: 48 Loss: 0.005870650742987269
Epoch: 49 Loss: 0.0059181639532509605
Epoch: 50 Loss: 0.006111461705969353
Epoch: 51 Loss: 0.0065285382351200835
Epoch: 52 Loss: 0.00653738682893546
Epoch: 53 Loss: 0.006996135276399161
Epoch: 54 Loss: 0.0068921979369693685
Epoch: 55 Loss: 0.006915093485364004
Epoch: 56 Loss: 0.006590827208894648
Epoch: 57 Loss: 0.005986897519936687
Epoch: 58 Loss: 0.005505527234881332
Epoch: 59 Loss: 0.005486250095265477
Epoch: 60 Loss: 0.005328546118873514
Epoch: 61 Loss: 0.005201273413963224
Epoch: 62 Loss: 0.0050582636245771456
Epoch: 63 Loss: 0.005050861264431947
Epoch: 64 Loss: 0.00482230298956366
Epoch: 65 Loss: 0.004573883831893143
Epoch: 66 Loss: 0.0045258559801272654
Epoch: 67 Loss: 0.00456396007518235
Epoch: 68 Loss: 0.0044648739827894855
Epoch: 69 Loss: 0.0044034052591182685
Epoch: 70 Loss: 0.00425111406825875
Epoch: 71 Loss: 0.0040337313824382265
Epoch: 72 Loss: 0.003864407282028543
Epoch: 73 Loss: 0.0037427281801539814
Epoch: 74 Loss: 0.0038831927200877353
Epoch: 75 Loss: 0.0037952308224416093
Epoch: 76 Loss: 0.003864793431651043
Epoch: 77 Loss: 0.0036816170353344397
Epoch: 78 Loss: 0.003452147130462292
Epoch: 79 Loss: 0.003448166069574654
Epoch: 80 Loss: 0.003304292343703932
Epoch: 81 Loss: 0.003222340834327042
Epoch: 82 Loss: 0.003224544882725336
Epoch: 83 Loss: 0.003223270030790254
Epoch: 84 Loss: 0.0031740863158024454
Epoch: 85 Loss: 0.0034535986947287853
Epoch: 86 Loss: 0.0034705428651681074
Epoch: 87 Loss: 0.0036690657419201577
Epoch: 88 Loss: 0.0037671017558558993
Epoch: 89 Loss: 0.003790103033871243
Epoch: 90 Loss: 0.003989822776546996
Epoch: 91 Loss: 0.004589476708420797
Epoch: 92 Loss: 0.004728387624613549
Epoch: 93 Loss: 0.0046698681912139845
Epoch: 94 Loss: 0.004900326535693909
Epoch: 95 Loss: 0.005195836303755641
Epoch: 96 Loss: 0.005314406252613193
Epoch: 97 Loss: 0.005819573802383323
Epoch: 98 Loss: 0.0063761204946786165
Epoch: 99 Loss: 0.005568674790035737
Epoch: 100 Loss: 0.005592766634531711
Epoch: 101 Loss: 0.005755231078518064
Epoch: 102 Loss: 0.0049010241453192735
Epoch: 103 Loss: 0.005123832888018928
Epoch: 104 Loss: 0.005025721240886732
Epoch: 105 Loss: 0.0044462255471827165
Epoch: 106 Loss: 0.00512865099957899
Epoch: 107 Loss: 0.004826162782448687
Epoch: 108 Loss: 0.00492025038080388
Epoch: 109 Loss: 0.004917252710775325
Epoch: 110 Loss: 0.004628816533735708
Epoch: 111 Loss: 0.005345795658956233
Epoch: 112 Loss: 0.004307232735874622
Epoch: 113 Loss: 0.004310282057543334
Epoch: 114 Loss: 0.004191724902116938
Epoch: 115 Loss: 0.003794580581597984
Epoch: 116 Loss: 0.0039508656574119075
Epoch: 117 Loss: 0.0033007224004617647
Epoch: 118 Loss: 0.0037161641215023244
Epoch: 119 Loss: 0.003514501541901968
Epoch: 120 Loss: 0.003465157578160104
Epoch: 121 Loss: 0.003867959750718192
Epoch: 122 Loss: 0.003286667119123434
Epoch: 123 Loss: 0.004053371799129404
Epoch: 124 Loss: 0.003662712394112819
Epoch: 125 Loss: 0.0047323735933260695
Epoch: 126 Loss: 0.004833808928532035
Epoch: 127 Loss: 0.00527770790320478
Epoch: 128 Loss: 0.005150761038653161
Epoch: 129 Loss: 0.0047622965955126445
Epoch: 130 Loss: 0.00506189467973615
Epoch: 131 Loss: 0.004172915065857141
Epoch: 132 Loss: 0.004285307950340211
Epoch: 133 Loss: 0.0036278479046335348
Epoch: 134 Loss: 0.003921666966849252
Epoch: 135 Loss: 0.0032986871481529975
Epoch: 136 Loss: 0.003557765733842787
Epoch: 137 Loss: 0.003151810464547261
Epoch: 138 Loss: 0.003404441105790044
Epoch: 139 Loss: 0.0032250171613046213
Epoch: 140 Loss: 0.0032860681684197565
Epoch: 141 Loss: 0.003279718104749918
Epoch: 142 Loss: 0.0030754596656678537
Epoch: 143 Loss: 0.0033861521055529777
Epoch: 144 Loss: 0.0029082115874380656
Epoch: 145 Loss: 0.003347509387439411
Epoch: 146 Loss: 0.0028801755060588845
Epoch: 147 Loss: 0.0034898846091604547
Epoch: 148 Loss: 0.002923240676816357
Epoch: 149 Loss: 0.003473897054056196
Epoch: 150 Loss: 0.003068041087030188
Epoch: 151 Loss: 0.0034530777300364876
Epoch: 152 Loss: 0.0031586125750388753
Epoch: 153 Loss: 0.003283055542085908
Epoch: 154 Loss: 0.0030669840910520995
Epoch: 155 Loss: 0.0030545540350048164
Epoch: 156 Loss: 0.002966630750482804
Epoch: 157 Loss: 0.002737454843315247
Epoch: 158 Loss: 0.002736590422787949
Epoch: 159 Loss: 0.0024825060421502904
Epoch: 160 Loss: 0.0025765369339895094
Epoch: 161 Loss: 0.002259143450493483
Epoch: 162 Loss: 0.0023811542318741743
Epoch: 163 Loss: 0.00213884768125258
Epoch: 164 Loss: 0.0022496198745150316
Epoch: 165 Loss: 0.0020480176470683595
Epoch: 166 Loss: 0.002100450139933903
Epoch: 167 Loss: 0.001984756397034385
Epoch: 168 Loss: 0.0019402368496613282
Epoch: 169 Loss: 0.0019166944242131553
Epoch: 170 Loss: 0.001847961114866561
Epoch: 171 Loss: 0.0018583957571536303
Epoch: 172 Loss: 0.001785919051862469
Epoch: 173 Loss: 0.0017925332955967047
Epoch: 174 Loss: 0.0017909580277965258
Epoch: 175 Loss: 0.0017244523501415785
Epoch: 176 Loss: 0.0017726976412201399
Epoch: 177 Loss: 0.0017117453114080586
Epoch: 178 Loss: 0.0017768179287055606
Epoch: 179 Loss: 0.0017261579908479593
Epoch: 180 Loss: 0.001777478426351751
Epoch: 181 Loss: 0.0017987985016876145
Epoch: 182 Loss: 0.0017212412663196262
Epoch: 183 Loss: 0.0017876321033231523
Epoch: 184 Loss: 0.0017653078937559929
Epoch: 185 Loss: 0.0017581147533890448
Epoch: 186 Loss: 0.0017263131939168823
Epoch: 187 Loss: 0.0017844047519917552
Epoch: 188 Loss: 0.0017904603124694212
Epoch: 189 Loss: 0.0017980766409125767
Epoch: 190 Loss: 0.0018653547679270176
Epoch: 191 Loss: 0.0019092846870128262
Epoch: 192 Loss: 0.0019134889436444563
Epoch: 193 Loss: 0.0019867611546559552
Epoch: 194 Loss: 0.002064238847723525
Epoch: 195 Loss: 0.0020934444006630464
Epoch: 196 Loss: 0.00212753071498714
Epoch: 197 Loss: 0.0020752153604438432
Epoch: 198 Loss: 0.001999061652704289
Epoch: 199 Loss: 0.0020741901581028573
In [9]:
plt.figure()
plt.plot(errt)
plt.title('Convergence')
Out[9]:
Text(0.5, 1.0, 'Convergence')

Uploading test_01_x.wav

In [10]:
test_x_01,sr2=librosa.load("test_x_01.wav",sr=None)
ipd.display(ipd.Audio(test_x_01,rate=16000))

Performing STFT on test_01_x.wav and taking absolute value

In [11]:
testx01=librosa.stft(test_x_01,n_fft=1024,hop_length=512)
testx01_abs=np.abs(testx01)
print(np.shape(testx01_abs))
(513, 142)
In [12]:
new_outputs=model1(torch.tensor(testx01_abs)) 
new_outputs1=torch.transpose(new_outputs,0,1)
new_output2=torch.div(torch.mul(torch.tensor(testx01,device=device),new_outputs1),torch.abs(torch.tensor(testx01,device=device)))

Recovered Version of test_01_x.wav

In [13]:
recovered_test_01_x=(new_output2.data).cpu().numpy()
ipd.display(ipd.Audio(librosa.core.istft(recovered_test_01_x, hop_length=512),rate=16000))

Uploading test_02_x.wav

In [14]:
test_x_02,sr2=librosa.load("test_x_02.wav",sr=None)

Performing STFT on test_02_x.wav and taking absolute value

In [15]:
testx02=librosa.stft(test_x_02,n_fft=1024,hop_length=512)
testx02_abs=np.abs(testx02)
In [16]:
new_outputs=model1(torch.tensor(testx02_abs))
new_outputs1=torch.transpose(new_outputs,0,1)
new_output2=torch.div(torch.mul(torch.tensor(testx02,device=device),new_outputs1),torch.abs(torch.tensor(testx02,device=device)))

Recovered Version of test_02_x.wav

In [17]:
recovered_test_02_x=(new_output2.data).cpu().numpy()
ipd.display(ipd.Audio(librosa.core.istft(recovered_test_02_x, hop_length=512),rate=sr2))

Note: The architecture used for the Network has 3 hidden layers with Relu Activation being used with a learning rate of 0.00003 for 200 epochs